library(dplyr)
library(ggplot2)
library(lmtest)
library(lubridate)
library(limma)
library(tidyr)
library(plotly)
library(gridExtra)
library(data.table)
library(formattable)

#Data Files and prep work
source("../../../lib/DataProccess.R")
source("../../../lib/NormFuncs.R")
source("../../../lib/OutlierDetectionFuncs.R")
source("../../../lib/DataPathName.R")
BaseDir <- params$BaseDir#get the root of the directory where the data is stored

The steps to this analysis are: Remove outliers Remove trend normalize residual find constant variance

First we look at Madison data

LIMSFullDF <- ParseData(LIMSWastePath(BaseDir))%>%
  filter(Site == "Madison")%>%
  select(Date, Site, N1, BCoV , N2 , PMMoV,
         FlowRate,temperature,equiv_sewage_amt)
ErrorMarkedDF <- LIMSFullDF%>%#
    mutate(FlagedOutliers = IdentifyOutliers(N1, Action = "Flag"),
           #Manual flagging that method misses due to boundary effect with binning
           FlagedOutliers = ifelse(Date == mdy("01/26/2021") | 
                                   Date == mdy("01/27/2021") | 
                                   Date == mdy("01/23/2022"),
                                   TRUE, FlagedOutliers),
           NoOutlierVar = ifelse(FlagedOutliers, NA, N1))

#Split N1 into outlier and non outlier for next ggplot
OutLierPlotDF <- ErrorMarkedDF%>%
  mutate(OutlierN1 = ifelse(FlagedOutliers,N1,NA))%>%
           mutate(N1 := NoOutlierVar)

OutLierPlotObject <- OutLierPlotDF%>%
  filter(!(is.na(N1)&is.na(OutlierN1)))%>%
  ggplot(aes(x=Date))+#Data depends on time 
  geom_line(aes(y=N1,
                color="N1", 
                info = N1))+#compares Var to Cases
  geom_point(aes(y=OutlierN1,
                 color="OutlierN1",
                 info = OutlierN1))

ggplotly(OutLierPlotObject,tooltip=c("info","Date"))
UpdatedDF <- ErrorMarkedDF%>%
  select(-N1)%>%
  rename(N1 = NoOutlierVar)
UpdatedDF$loessN1 <- loessFit(y=(UpdatedDF$N1), 
                      x=UpdatedDF$Date, #create loess fit of the data
                      span=.05, #span of .2 seems to give the best result, not rigorously chosen
                      iterations=2)$fitted#2 iterations remove some bad patterns

SLDLoessGraphic <- UpdatedDF%>%
  NoNa("loessN1")%>%
  ggplot(aes(x=Date))+
  geom_line(aes(y=N1,
                color="N1",
                info = N1),
            alpha=.2)+
  geom_line(aes(y = loessN1, 
                color="loessN1" ,
                info = loessN1))+
  geom_hline(yintercept = (5e4))+
  geom_hline(yintercept = (1e6))


ggplotly(SLDLoessGraphic,tooltip=c("info","Date"))
DetrendedDF <- UpdatedDF%>%
  mutate(DetrendedN1 = N1 - loessN1)


DetrendLoessGraphic <- DetrendedDF%>%
  NoNa("DetrendedN1")%>%
  ggplot(aes(x=Date))+
  geom_line(aes(y=DetrendedN1,
                color="DetrendedN1",
                info = DetrendedN1))


ggplotly(DetrendLoessGraphic,tooltip=c("info","Date"))
mean(DetrendedDF$DetrendedN1, na.rm=TRUE)
## [1] 12278.31
library(tseries)
NormRestruct <- function(Norm){
  return(mean(Norm,na.rm=TRUE)/Norm)
}
TestNorm <- function(DF,Base,Norm){
  noNADF <- DF%>%
    NoNa(Norm)%>%
    mutate(SevDay = rollapply(!!sym(Norm),7,mean, partial = TRUE),
           SevDayBase = rollapply(!!sym(Base),7,mean, partial = TRUE))
  
  
  ResidDetrendLoessGraphic <- noNADF%>%
  ggplot(aes(x=Date))+
  geom_point(aes(y=!!sym(Base),
                color=Base),
             size = .5)+
  geom_point(aes(y=!!sym(Norm),
                color=Norm),
             size = .5)+
  geom_line(aes(y=SevDayBase,
                color="SevDayBase"))+
  geom_line(aes(y=SevDay,
                color="SevDay"))

  print(adf.test(noNADF[[Norm]]))
  ggplotly(ResidDetrendLoessGraphic,tooltip=c("y","Date"))
}


ResidDetrendedDF <- DetrendedDF%>%
  mutate(N1NormedResid = DetrendedN1*NormRestruct(loessN1),
         sqrtNormedResid = DetrendedN1*NormRestruct(sqrt(loessN1)),
         logNormedResid = DetrendedN1*NormRestruct(log(loessN1)),
         xlogNormedResid = DetrendedN1*NormRestruct(loessN1*log(loessN1)),
         WeirdNormedResid = DetrendedN1*NormRestruct(sqrt(loessN1)*log(loessN1)))


TestNorm(ResidDetrendedDF,"DetrendedN1","DetrendedN1")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  noNADF[[Norm]]
## Dickey-Fuller = -9.6536, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
TestNorm(ResidDetrendedDF,"DetrendedN1","N1NormedResid")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  noNADF[[Norm]]
## Dickey-Fuller = -9.372, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
TestNorm(ResidDetrendedDF,"DetrendedN1","sqrtNormedResid")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  noNADF[[Norm]]
## Dickey-Fuller = -9.795, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
ResidDetrendedDF <- DetrendedDF%>%
  mutate(VarCompN1 = DetrendedN1**2,
         N1NormedResid = VarCompN1*NormRestruct(loessN1^2),
         sqrtNormedResid = VarCompN1*NormRestruct(loessN1),
         logNormedResid = VarCompN1*NormRestruct(log(loessN1)^2),
         xlogNormedResid = VarCompN1*NormRestruct(loessN1^2*log(loessN1)^2),
         WeirdNormedResid = VarCompN1*NormRestruct(loessN1*log(loessN1)^2))


TestNorm(ResidDetrendedDF,"VarCompN1","VarCompN1")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  noNADF[[Norm]]
## Dickey-Fuller = -4.2406, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
TestNorm(ResidDetrendedDF,"VarCompN1","N1NormedResid")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  noNADF[[Norm]]
## Dickey-Fuller = -5.4619, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
TestNorm(ResidDetrendedDF,"VarCompN1","sqrtNormedResid")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  noNADF[[Norm]]
## Dickey-Fuller = -4.5547, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary

ViewTwo: Shape of the Var series

ShapeDF <- ResidDetrendedDF%>%
  filter(!is.na(log(VarCompN1)),!is.na(log(N1)))
ShapeDF$GamVar <- mgcv::gam(
            formula = log(VarCompN1) ~ s(log(N1), bs = "cs"), data = ShapeDF)$fitted

ShapeDF%>%
    NoNa("VarCompN1")%>%
    ggplot(aes(x = log(N1)))+
    geom_point(aes(y=log(VarCompN1),
                color="VarCompN1"),
             size = .5)+
    geom_line(aes(y=GamVar,
                color="GamVar"))+
  geom_abline(slope = 1.62603,intercept = 2.11644,color="Blue")+
  geom_vline(xintercept = log(5e4))+
  geom_vline(xintercept = log(1e6))+
  facet_wrap(~Site)

AllSiteDF <- ParseData(LIMSWastePath(BaseDir))
SiteDFList <- split(AllSiteDF,AllSiteDF$Site)

loessFitandDetrend <- function(DF){
  spanC = min(.05*393/length(DF$N1),.3)
  DF$loessN1 <- loessFit(y=DF$N1, 
                      x=DF$Date, #create loess fit of the data
                      span=spanC,
                      iterations=2)$fitted#2 iterations remove some bad patterns
  RetDF <- DF%>%
    mutate(DetrendedN1 = N1 - loessN1,
           VarN1 = DetrendedN1^2,
           SevDay = rollapply(VarN1,7,mean, partial = TRUE))
  return(RetDF)
}
TempListDF <- lapply(SiteDFList,loessFitandDetrend)

FullDetrendedData <- rbindlist(TempListDF)%>%
  NoNa("N1","VarN1")%>%
  filter(VarN1>1)
FullDetrendedData$loessVar <- loessFit(y=log(FullDetrendedData$VarN1), 
                      x=log(FullDetrendedData$N1), #create loess fit of the data
                      span=.3,
                      iterations=2,
                      method = "loess")$fitted#2 iterations remove some bad patterns

FullDetrendedData$GamVar <- mgcv::gam(
            formula = log(VarN1) ~ s(log(N1), bs = "cs"),data = FullDetrendedData)$fitted



FullDetrendedData%>%
    NoNa("VarN1")%>%
    ggplot(aes(x = log(N1)))+
    geom_point(aes(y=log(VarN1),
                color="VarN1"),
             size = 2,alpha=.1)+
  geom_abline(slope = 1.62603,intercept = 2.11644,color="Blue")+
  geom_vline(xintercept = log(5e4))+
  geom_vline(xintercept = log(1e6))+
  geom_line(aes(y=GamVar))

RemoveLowDF <- FullDetrendedData%>%
  filter(N1>5e4,N1<1e6)

RemoveLowDF%>%
  ggplot(aes(x=log(N1),y=GamVar))+
  geom_point()+
  geom_abline(slope = 1.635322,intercept = 2.009892)

  #geom_abline(slope = 1.842089,intercept = -0.534551)
  
#1.862581,   -0.523034
summary(lm(GamVar~log(N1),data=RemoveLowDF))
## 
## Call:
## lm(formula = GamVar ~ log(N1), data = RemoveLowDF)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34044 -0.15823  0.04028  0.13855  0.46399 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.009892   0.046776   42.97   <2e-16 ***
## log(N1)     1.635322   0.003819  428.16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1935 on 3845 degrees of freedom
## Multiple R-squared:  0.9795, Adjusted R-squared:  0.9795 
## F-statistic: 1.833e+05 on 1 and 3845 DF,  p-value: < 2.2e-16
summary(lm(log(VarN1)~log(N1),data=RemoveLowDF))
## 
## Call:
## lm(formula = log(VarN1) ~ log(N1), data = RemoveLowDF)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.3053  -1.1762   0.3949   1.5645   8.5448 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.11644    0.61003   3.469 0.000527 ***
## log(N1)      1.62603    0.04981  32.644  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.524 on 3845 degrees of freedom
## Multiple R-squared:  0.217,  Adjusted R-squared:  0.2168 
## F-statistic:  1066 on 1 and 3845 DF,  p-value: < 2.2e-16
RemoveLowDF%>%
  ggplot(aes(x=log(N1),y=log(VarN1) - 1.842089*log(N1)+0.534551))+
  geom_point()+
  geom_abline(slope = 1.842089,intercept = -0.534551)

FullDetrendedData%>%
  #filter(log(N1)>11)%>%
    NoNa("VarN1")%>%
    ggplot(aes(x = N1))+
  scale_x_log10()+
  geom_point(aes(y=GamVar))

FullDetrendedData%>%
  #filter(log(N1)>11)%>%
    ggplot(aes(x = N1))+
  scale_x_log10()+
  geom_point(aes(y=GamVar-lag(GamVar)),alpha=.1,size=.5)